{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "59d5ff3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Load the source records (indexed by position; each one is a dict).\n",
    "# JSON text is UTF-8 by specification, so decode it explicitly instead of\n",
    "# relying on the platform's default locale encoding.\n",
    "with open('extra-emilia-mandarin.json', encoding='utf-8') as fopen:\n",
    "    data = json.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f74d405a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import random\n",
    "\n",
    "# Prompt templates; one of them is picked at random for every dataset row.\n",
    "questions = [\n",
    "    'can you describe the audio',\n",
    "    'explain about the audio',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "278c18ee",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'audio_filename': 'ZH/ZH_B00001_S02523_W000044.mp3',\n",
       " 'transcript_whisper': '但是，当变化威胁现状时，国家就会介入，以恢复秩序。'}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the first record to see which fields each entry carries.\n",
    "data[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "707274d9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(155336, 137313)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "\n",
    "# Number of entries found in each of the two description directories.\n",
    "tuple(\n",
    "    len(glob(pattern))\n",
    "    for pattern in ('vllm-mandarin-description/*', 'vllm-mandarin-description-v2/*')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "1209e2fc",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████| 868185/868185 [00:14<00:00, 61260.76it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "# Directories that may hold a `<index>.json` description for a record. They are\n",
    "# read in order, so a file in a later directory overrides an earlier one.\n",
    "description_dirs = ['vllm-mandarin-description', 'vllm-mandarin-description-v2']\n",
    "\n",
    "# Private seeded generator: the question assigned to each record is the same on\n",
    "# every run of this cell, and the global `random` state is left untouched.\n",
    "rng = random.Random(42)\n",
    "\n",
    "selected = []\n",
    "unreadable = []  # (path, error) for files that exist but could not be read or parsed\n",
    "for i in tqdm(range(len(data))):\n",
    "\n",
    "    d = None\n",
    "    for folder in description_dirs:\n",
    "        filename = os.path.join(folder, f'{i}.json')\n",
    "        try:\n",
    "            with open(filename, encoding='utf-8') as fopen:\n",
    "                d = json.load(fopen)\n",
    "        except FileNotFoundError:\n",
    "            # Expected: not every record has a description in every directory.\n",
    "            continue\n",
    "        except (OSError, ValueError) as e:\n",
    "            # Damaged or unreadable file: keep whatever was loaded so far, but\n",
    "            # record the failure instead of hiding it.\n",
    "            unreadable.append((filename, repr(e)))\n",
    "\n",
    "    if d is None:\n",
    "        continue\n",
    "\n",
    "    # Drop the `<END>` marker (also when it was cut off as `<END`) and trim whitespace.\n",
    "    d = d.replace('<END>', '').replace('<END', '').strip()\n",
    "\n",
    "    selected.append({\n",
    "        'question': rng.choice(questions),\n",
    "        'answer': d,\n",
    "        'metadata': json.dumps(data[i]),\n",
    "        'audio_filename': data[i]['audio_filename'],\n",
    "    })\n",
    "\n",
    "if unreadable:\n",
    "    print(f'{len(unreadable)} description file(s) could not be read, e.g. {unreadable[:3]}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "d2ebc398",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "292649"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# How many records ended up with a description attached.\n",
    "len(selected)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "23f1915c",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question': 'explain about the audio',\n",
       " 'answer': 'This statement is a concise and philosophical reflection on the concept of leveraging external resources or tools to maximize one\\'s own capabilities. Let\\'s break it down:\\n\\n1. **借** (jiè) - To borrow or utilize.\\n2. **助** (zhù) - Assistance or help.\\n3. **外物** (wàiwù) - External objects or resources.\\n4. **的力量** (de lìliang) - The power or strength.\\n5. **使** (shǐ) - To make or cause.\\n6. **自己的** (zìjǐ de) - One\\'s own.\\n7. **力量** (lìliang) - Strength or power.\\n8. **得到** (dé dào) - To obtain or achieve.\\n9. **最大** (zuì dà) - Maximum.\\n10. **程度** (chéngdù) - Degree or extent.\\n11. **的** (de) - Of or -\\'s (possessive).\\n12. **发挥** (fāhuī) - To bring into play or utilize effectively.\\n\\n### Translation:\\n\"By leveraging the power of external resources, one can maximize their own strength.\"\\n\\n### Explanation:\\n- **Leveraging External Resources**: The phrase \"借助外物的力量\" suggests using tools, technology, or assistance from others to enhance one\\'s abilities. This can be applied in various contexts, such as using software to improve productivity, seeking mentorship to gain new skills, or utilizing physical tools to complete tasks more efficiently.\\n- **Maximizing One\\'s Own Strength**: The idea here is that by integrating these external resources, one can achieve their full potential. It implies that individual effort alone may not be sufficient to reach the highest levels of performance, and that strategic use of external aids can significantly amplify one\\'s capabilities.\\n\\nThis statement reflects a pragmatic and strategic approach to personal and professional development. It encourages individuals to recognize and utilize the resources available to them to achieve their goals more effectively. The philosophy is often seen in fields like business, technology, and personal growth, where collaboration and the use of advanced tools are essential for success.',\n",
       " 'metadata': '{\"audio_filename\": \"ZH/ZH_B00002_S03424_W000005.mp3\", \"transcript_whisper\": \"\\\\u501f\\\\u52a9\\\\u5916\\\\u7269\\\\u7684\\\\u529b\\\\u91cf\\\\uff0c\\\\u4f7f\\\\u81ea\\\\u5df1\\\\u7684\\\\u529b\\\\u91cf\\\\u5f97\\\\u5230\\\\u6700\\\\u5927\\\\u7a0b\\\\u5ea6\\\\u7684\\\\u53d1\\\\u6325\\\\u3002\"}',\n",
       " 'audio_filename': 'ZH/ZH_B00002_S03424_W000005.mp3'}"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check one assembled row: question, cleaned answer, JSON-encoded metadata, audio path.\n",
    "selected[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "b64c8695",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import Dataset\n",
    "\n",
    "# Build a Hugging Face `Dataset` from the list of row dicts collected above.\n",
    "dataset = Dataset.from_list(selected)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "16b16711",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7da09866e6d04f5985ea8546d291c5db",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bbc7c5331f324585afa228103fb44528",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/147 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d87b1f8bf26f405a92b79697d79c3dbf",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading...:   0%|          | 0.00/159M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f496687ff3ec4405b478ae8151646536",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/147 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0e770feaa38c468f90c6397e48622055",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading...:   0%|          | 0.00/160M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7a6e87b0e5354a4b87181aea4d96a90b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "README.md:   0%|          | 0.00/984 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Emilia-Mandarin-Description-Instructions/commit/82658fd748a972070a1387c70fc336f17eba188e', commit_message='Upload dataset', commit_description='', oid='82658fd748a972070a1387c70fc336f17eba188e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Emilia-Mandarin-Description-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Emilia-Mandarin-Description-Instructions'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Publish the dataset to the Hugging Face Hub under the repo id below.\n",
    "# NOTE(review): presumably relies on Hub credentials already configured in this environment - confirm.\n",
    "dataset.push_to_hub('mesolitica/Emilia-Mandarin-Description-Instructions')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
